Startup

library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   2.0.1     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(cowplot)
library(ggpmisc)
Loading required package: ggpp

Attaching package: ‘ggpp’

The following object is masked from ‘package:ggplot2’:

    annotate
library(here)
here() starts at /labs/khatrilab/solomonb/hla_project/hla_benchmark
source(here("helper_functions/data_import_functions.R"))
source(here("helper_functions/figure_format_functions.R"))
Loading required package: flextable
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio

Attaching package: ‘flextable’

The following object is masked from ‘package:purrr’:

    compose
source(here("helper_functions/calculation_functions.R"))
Loading required package: tidymodels
Registered S3 method overwritten by 'tune':
  method                   from   
  required_pkgs.model_spec parsnip
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.3 ──
✔ broom        0.7.9      ✔ rsample      0.1.0 
✔ dials        0.0.9      ✔ tune         0.1.6 
✔ infer        1.0.0      ✔ workflows    0.2.3 
✔ modeldata    0.1.1      ✔ workflowsets 0.1.0 
✔ parsnip      0.1.7      ✔ yardstick    0.0.8 
✔ recipes      0.1.16     
── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
✖ ggpp::annotate()     masks ggplot2::annotate()
✖ flextable::compose() masks purrr::compose()
✖ scales::discard()    masks purrr::discard()
✖ dplyr::filter()      masks stats::filter()
✖ recipes::fixed()     masks stringr::fixed()
✖ dplyr::lag()         masks stats::lag()
✖ yardstick::spec()    masks readr::spec()
✖ recipes::step()      masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
Loading required package: checkmate

Attaching package: ‘checkmate’

The following object is masked from ‘package:recipes’:

    check_class

Loading required package: lubridate

Attaching package: ‘lubridate’

The following object is masked from ‘package:cowplot’:

    stamp

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union
isb_path <- here("data/isb")
isb_samples <- read_tsv(here("data/isb/logs/parallel.log")) %>% 
  separate(Command, into = c(NA, "sample"), sep = " ") %>% 
  pull(sample) %>% unique()
Rows: 294 Columns: 9
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): Host, Command
dbl (7): Seq, Starttime, JobRuntime, Send, Receive, Exitval, Signal

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Get reference alleles

reference_alleles <- read_tsv(here("references/GCh38_reference_alleles.txt"), col_names = F) %>% pull(1)
Rows: 26 Columns: 1
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): X1

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
genome_region_reference <- get_allele_length(reference_alleles)

Get arcasHLA alignment stats

arcas_log_dir <- sprintf("%s/arcasHLA", isb_path)
alignment_stats_df <- hla_mapping_stats_import(isb_samples, arcas_log_dir)

Import cell counts

cell_counts_df <- readRDS(here("data/isb/isb_sequenced_cell_count.RDS"))

Assemble accuracy DF

success_df <- readRDS(here("2_accuracy/isb_success.RDS"))
accuracy_df <- readRDS(here("3_DRB/isb_accuracy_drb345_filtered.RDS"))
accuracy_df <- accuracy_df %>% 
  left_join(
    success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(cell_counts_df, by = "sample") %>%
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)$", sample) & genotyper != "hlaminer") 

Coverage based plots

gg_coverage(accuracy_df, x_var = "coverage", y_var = "accuracy") 
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "coverage", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)

Cell number based plots

gg_coverage(accuracy_df, x_var = "n_cells", y_var = "accuracy") 
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "n_cells", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)

Subsample READS

Import read subsample data

# Import predicted genotypes
subsample_reads_path <- here("data/isb/subsample")
subsample_reads_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_reads_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_reads_df <- format_hla_table(combine_HLA_import(path = subsample_reads_path, samples = subsample_reads_samples))
subsample_reads_df <- subsample_reads_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_reads_df
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
Rows: 1557 Columns: 9
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (8): Sample ID, Locus, Allele 1, Allele 2, Comments, Diploid Ambiguities, Allele 1 Ambiguities, Allele 2 Ambiguities
dbl (1): index

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
invitro_reads_df <- subsample_reads_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_reads_df
# Import read statistics
subsample_reads_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_reads_path)
subsample_reads_alignment_stats_df <- hla_mapping_stats_import(subsample_reads_samples, subsample_reads_arcas_log_dir)

Calculate accuracy

# Calculate accuracy
subsample_reads_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_reads_accuracy_df, "isb_subsample_reads_accuracy.RDS")
# Calculate success
subsample_reads_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_reads_success_df, "isb_subsample_reads_success.RDS")
# Combine various data
subsample_reads_combined_df <- subsample_reads_accuracy_df %>% 
  left_join(
    subsample_reads_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_reads_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  bind_rows(accuracy_df %>% select(-n_cells)) %>% # No cell numbers for read subsamples
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>%  # Calculate coverage
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>%  # Calculate n_alleles
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 
# saveRDS(subsample_reads_combined_df, "subsample_reads_combined_df.RDS")
# subsample_reads_combined_df <- readRDS("subsample_reads_combined_df.RDS")

Reads plots

gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
`geom_smooth()` using formula 'y ~ x'

plt_read_accuracy_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
plt_read_accuracy_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "accuracy") 
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

plt_read_success_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "success") 
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)

plt_read_allele_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
  • Default arcasHLA cut off 75 reads (log10 = 1.8)
subsample_reads_combined_df %>% 
  filter(field == "field_2") %>% 
  mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>% 
  ggplot(aes(y = n_alleles, x = log10(coverage)))+
  # geom_violin(scale = "count", alpha = 0.5, )+
  geom_jitter(size = 0.2, width = 0, height =0.2, alpha = 0.2)+
  # geom_boxplot(width=0.1)+
  # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
  # coord_flip()+
  facet_grid(genotyper ~ locus, scales = "free")+
  theme_bw()

Subsample CELLS

# Import predicted genotypes
subsample_cells_path <- here("data/isb/subsample_cells")
subsample_cells_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_cells_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_cells_df <- format_hla_table(combine_HLA_import(path = subsample_cells_path, samples = subsample_cells_samples))
subsample_cells_df <- subsample_cells_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_cells_df
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
Rows: 1557 Columns: 9
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (8): Sample ID, Locus, Allele 1, Allele 2, Comments, Diploid Ambiguities, Allele 1 Ambiguities, Allele 2 Ambiguities
dbl (1): index

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
invitro_cells_df <- subsample_cells_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_cells_df
# Import read statistics
subsample_cells_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_cells_path)
subsample_cells_alignment_stats_df <- hla_mapping_stats_import(subsample_cells_samples, subsample_cells_arcas_log_dir)
# Import cell counts for each subsample
subsample_cell_counts_df <- read_csv(here("data/isb/subsample_cells/subsample_cell_counts.csv"))
Rows: 1176 Columns: 2
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): sample
dbl (1): n_cells

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Calculate accuracy

# Calculate accuracy
subsample_cells_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_cells_accuracy_df, "isb_subsample_cells_accuracy.RDS")
# subsample_cells_accuracy_df <- readRDS("isb_subsample_cells_accuracy.RDS")
# Calculate success
subsample_cells_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_cells_success_df, "isb_subsample_cells_success.RDS")
# subsample_cells_success_df <- readRDS("isb_subsample_cells_success.RDS")
# Combine various data
subsample_cells_combined_df <- subsample_cells_accuracy_df %>% 
  left_join(
    subsample_cells_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_cells_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(subsample_cell_counts_df, by = "sample") %>%
  bind_rows(accuracy_df) %>% 
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 
# saveRDS(subsample_cells_combined_df, "subsample_cells_combined_df.RDS")

Coverage plots

gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "accuracy") 
`geom_smooth()` using formula 'y ~ x'

gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)

Cell number plots

gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
`geom_smooth()` using formula 'y ~ x'

plt_cell_accuracy_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
plt_cell_accuracy_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "accuracy") 
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
`geom_smooth()` using formula 'y ~ x'

plt_cell_success_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
plt_cell_success_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "success") 
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)

plt_cell_allele_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
# subsample_cells_combined_df %>% 
#   mutate(reads = reads/n_cells) %>% 
#   filter(reads < 100) %>% 
#   gg_coverage(x_var = "reads", y_var = "accuracy", x_log = F) 

Combined plots

no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_abc)
`geom_smooth()` using formula 'y ~ x'
plot_grid(
  plot_grid(
    plt_read_accuracy_abc + no_leg,
    plt_read_success_abc + no_leg,
    plt_read_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_abc + no_leg,
    plt_cell_success_abc + no_leg,
    plt_cell_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_all)
`geom_smooth()` using formula 'y ~ x'
plot_grid(
  plot_grid(
    plt_read_accuracy_all + no_leg,
    plt_read_success_all + no_leg,
    plt_read_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_all + no_leg,
    plt_cell_success_all + no_leg,
    plt_cell_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

Main plot

loci <- c("A", "B", "C", "DPB1", "DQA1", "DRB1")
plt_read_accuracy_subset <- gg_coverage(
  subsample_reads_combined_df %>% 
    filter(locus %in% loci) %>% 
    filter(!(genotyper == "phlat" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DQA1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DRB1")), 
  x_var = "reads", y_var = "accuracy") +
  facet_wrap(~locus, nrow = 2)
plt_read_accuracy_subset
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 59 rows containing non-finite values (stat_smooth).
Warning: Removed 59 rows containing missing values (geom_point).

save_plot(here("figures_pdf/v2/5_coverage.pdf"), plt_read_accuracy_subset, base_height = 4, base_width = 6)
plt_read_allele_all <- gg_coverage(subsample_reads_combined_df %>% 
    filter(locus %in% loci) %>% 
    filter(!(genotyper == "phlat" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DQA1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DRB1")), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_all
Warning: Removed 62 rows containing missing values (geom_point).

plt_main <- plot_grid(
  plt_read_accuracy_subset,
  plt_read_allele_all,
  ncol = 1,
  rel_heights = c(3,2),
  align = "v",
  axis = "lr",
  labels = c("A", "B")
)
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 59 rows containing non-finite values (stat_smooth).
Warning: Removed 59 rows containing missing values (geom_point).
Warning: Removed 62 rows containing missing values (geom_point).
plt_main

save_plot(here("figures_pdf/v2/5_coverage.pdf"), plt_main, base_height = 7, base_width = 7)

Slope stats

get_lm_stats <- function(df, x_var = "coverage", y_var = "accuracy", field_var = "field_2", x_log = T){
  # Input checks
  arg_col <- makeAssertCollection()
  assertChoice(y_var, c("accuracy", "success", "n_alleles"), add = arg_col)
  assertChoice(x_var, c("reads", "coverage", "n_cells"), add = arg_col)
  assertChoice(field_var, c("field_1", "field_2", "field_3"), add = arg_col)
  assertLogical(x_log, add = arg_col)
  if (arg_col$isEmpty()==F) {map(arg_col$getMessages(),print);reportAssertions(arg_col)}
  
  # browser()
  df <- df %>% 
    filter(field == field_var) %>% 
    exclude_genotyper_fields() # Replace w/ exclude_genotyper_fields?
  if (x_log ==T){df <- df %>% mutate_at(x_var, log10)}
  df %>%
  filter_at(x_var, function(x) !is.na(x) & !is.infinite(x)) %>% 
  group_by(locus, field, genotyper) %>%
  nest() %>%
  mutate(data = map(data, function(x){
    summary(lm(!!sym(y_var) ~ !!sym(x_var), data = x)) %>% broom::tidy() %>% filter(term == x_var)
  })) %>% unnest_wider(data) %>%
  mutate(est_min = round(estimate - 2*std.error, 2),
         est_max = round(estimate + 2*std.error, 2))
}

model_stats <- get_lm_stats(subsample_cells_combined_df, x_var = "reads", y_var = "accuracy", x_log = T)
model_stats
model_stats <- bind_rows(
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy", x_log = T) %>% mutate(x_var = "reads", y_var = "accuracy"),
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "success", x_log = T) %>% mutate(x_var = "reads", y_var = "success"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy", x_log = T) %>% mutate(x_var = "cells", y_var = "accuracy"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "success", x_log = T) %>% mutate(x_var = "cells", y_var = "success")
)  %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") 

Table

Scratch work

---
title: "5) Coverage Analysis"
output: 
  html_notebook:
    toc: true
    toc_depth: 3
    toc_float: true
    code_folding: hide
---


# Startup 

```{r, message = F}
library(tidyverse)
library(cowplot)
library(ggpmisc)
library(here)

source(here("helper_functions/data_import_functions.R"))
source(here("helper_functions/figure_format_functions.R"))
source(here("helper_functions/calculation_functions.R"))
isb_path <- here("data/isb")
isb_samples <- read_tsv(here("data/isb/logs/parallel.log")) %>% 
  separate(Command, into = c(NA, "sample"), sep = " ") %>% 
  pull(sample) %>% unique()
```

### Get reference alleles

- GCh38 reference alleles (per https://www.ebi.ac.uk/ipd/imgt/hla/help/genomics.html)
- To be used to calculate coverage

```{r, message = F}
reference_alleles <- read_tsv(here("references/GCh38_reference_alleles.txt"), col_names = F) %>% pull(1)
genome_region_reference <- get_allele_length(reference_alleles)
```

### Get arcasHLA alignment stats

```{r}
arcas_log_dir <- sprintf("%s/arcasHLA", isb_path)
alignment_stats_df <- hla_mapping_stats_import(isb_samples, arcas_log_dir)
```

### Import cell counts

```{r}
cell_counts_df <- readRDS(here("data/isb/isb_sequenced_cell_count.RDS"))
```

### Assemble accuracy DF

```{r}
success_df <- readRDS(here("2_accuracy/isb_success.RDS"))
accuracy_df <- readRDS(here("3_DRB/isb_accuracy_drb345_filtered.RDS"))
accuracy_df <- accuracy_df %>% 
  left_join(
    success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(cell_counts_df, by = "sample") %>%
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)$", sample) & genotyper != "hlaminer") 
```


### Coverage based plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)
```

### Cell number based plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
```


# Subsample READS

### Import read subsample data

```{r}
# Import predicted genotypes
subsample_reads_path <- here("data/isb/subsample")
subsample_reads_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_reads_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_reads_df <- format_hla_table(combine_HLA_import(path = subsample_reads_path, samples = subsample_reads_samples))
subsample_reads_df <- subsample_reads_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_reads_df
```

```{r, message = F}
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
invitro_reads_df <- subsample_reads_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_reads_df
```

```{r}
# Import read statistics
subsample_reads_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_reads_path)
subsample_reads_alignment_stats_df <- hla_mapping_stats_import(subsample_reads_samples, subsample_reads_arcas_log_dir)
```

### Calculate accuracy

```{r}
# Calculate accuracy
subsample_reads_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_reads_accuracy_df, here("5_coverage_accuracy/isb_subsample_reads_accuracy.RDS"))
```


```{r}
# Calculate success
subsample_reads_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_reads_success_df, here("5_coverage_accuracy/isb_subsample_reads_success.RDS"))
```

```{r}
# Combine various data
subsample_reads_combined_df <- subsample_reads_accuracy_df %>% 
  left_join(
    subsample_reads_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_reads_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  bind_rows(accuracy_df %>% select(-n_cells)) %>% # No cell numbers for read subsamples
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>%  # Calculate coverage
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>%  # Calculate n_alleles
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 
# saveRDS(subsample_reads_combined_df, here("5_coverage_accuracy/subsample_reads_combined_df.RDS"))
# subsample_reads_combined_df <- readRDS(here("5_coverage_accuracy/subsample_reads_combined_df.RDS"))
```

### Reads plots
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
plt_read_accuracy_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
plt_read_accuracy_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "success") 
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
```
- Default arcasHLA cut off 75 reads (log10 = 1.8)

```{r, fig.width=12, fig.height=3, warning = F, message = F}
subsample_reads_combined_df %>% 
  filter(field == "field_2") %>% 
  mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>% 
  ggplot(aes(y = n_alleles, x = log10(coverage)))+
  # geom_violin(scale = "count", alpha = 0.5, )+
  geom_jitter(size = 0.2, width = 0, height =0.2, alpha = 0.2)+
  # geom_boxplot(width=0.1)+
  # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
  # coord_flip()+
  facet_grid(genotyper ~ locus, scales = "free")+
  theme_bw()
```

# Subsample CELLS

```{r}
# Import predicted genotypes
subsample_cells_path <- here("data/isb/subsample_cells")
subsample_cells_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_cells_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_cells_df <- format_hla_table(combine_HLA_import(path = subsample_cells_path, samples = subsample_cells_samples))
subsample_cells_df <- subsample_cells_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_cells_df
```


```{r, message = F}
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
invitro_cells_df <- subsample_cells_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_cells_df
```

```{r}
# Import read statistics
subsample_cells_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_cells_path)
subsample_cells_alignment_stats_df <- hla_mapping_stats_import(subsample_cells_samples, subsample_cells_arcas_log_dir)
```

```{r}
# Import cell counts for each subsample
subsample_cell_counts_df <- read_csv(here("data/isb/subsample_cells/subsample_cell_counts.csv"))
```

### Calculate accuracy

```{r}
# Calculate accuracy
subsample_cells_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_cells_accuracy_df, here("5_coverage_accuracy/isb_subsample_cells_accuracy.RDS"))
# subsample_cells_accuracy_df <- readRDS(here("5_coverage_accuracy/isb_subsample_cells_accuracy.RDS"))
```

```{r}
# Calculate success
subsample_cells_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_cells_success_df, here("5_coverage_accuracy/isb_subsample_cells_success.RDS"))
# subsample_cells_success_df <- readRDS(here("5_coverage_accuracy/isb_subsample_cells_success.RDS"))
```

```{r}
# Combine various data
subsample_cells_combined_df <- subsample_cells_accuracy_df %>% 
  left_join(
    subsample_cells_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_cells_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(subsample_cell_counts_df, by = "sample") %>%
  bind_rows(accuracy_df) %>% 
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 
# saveRDS(subsample_cells_combined_df, here("5_coverage_accuracy/subsample_cells_combined_df.RDS"))
```

### Coverage plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)
```

### Cell number plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
plt_cell_accuracy_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
plt_cell_accuracy_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
plt_cell_success_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
plt_cell_success_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "success") 
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
```


```{r, fig.width=12, fig.height=3, warning = F, message = F}
# subsample_cells_combined_df %>% 
#   mutate(reads = reads/n_cells) %>% 
#   filter(reads < 100) %>% 
#   gg_coverage(x_var = "reads", y_var = "accuracy", x_log = F) 
```








# Combined plots
```{r, fig.width = 9, fig.height = 7, message = F, warning = F}
no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_abc)
plot_grid(
  plot_grid(
    plt_read_accuracy_abc + no_leg,
    plt_read_success_abc + no_leg,
    plt_read_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_abc + no_leg,
    plt_cell_success_abc + no_leg,
    plt_cell_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
```

```{r, fig.width = 18, fig.height = 7, message = F, warning = F}
no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_all)
plot_grid(
  plot_grid(
    plt_read_accuracy_all + no_leg,
    plt_read_success_all + no_leg,
    plt_read_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_all + no_leg,
    plt_cell_success_all + no_leg,
    plt_cell_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
```

### Main plot

```{r}
loci <- c("A", "B", "C", "DPB1", "DQA1", "DRB1")
plt_read_accuracy_subset <- gg_coverage(
  subsample_reads_combined_df %>% 
    filter(locus %in% loci) %>% 
    filter(!(genotyper == "phlat" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DQA1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DRB1")), 
  x_var = "reads", y_var = "accuracy") +
  facet_wrap(~locus, nrow = 2)
plt_read_accuracy_subset
```
```{r}
save_plot(here("figures_pdf/v2/5_coverage.pdf"), plt_read_accuracy_subset, base_height = 4, base_width = 6)
```

```{r}
plt_read_allele_all <- gg_coverage(subsample_reads_combined_df %>% 
    filter(locus %in% loci) %>% 
    filter(!(genotyper == "phlat" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DQA1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DRB1")), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_all
```
```{r, fig.width = 7, fig.height=7}
plt_main <- plot_grid(
  plt_read_accuracy_subset,
  plt_read_allele_all,
  ncol = 1,
  rel_heights = c(3,2),
  align = "v",
  axis = "lr",
  labels = c("A", "B")
)
plt_main
```
```{r}
save_plot(here("figures_pdf/v2/5_coverage.pdf"), plt_main, base_height = 7, base_width = 7)
```

# Slope stats

```{r}
get_lm_stats <- function(df, x_var = "coverage", y_var = "accuracy", field_var = "field_2", x_log = T){
  # Input checks
  arg_col <- makeAssertCollection()
  assertChoice(y_var, c("accuracy", "success", "n_alleles"), add = arg_col)
  assertChoice(x_var, c("reads", "coverage", "n_cells"), add = arg_col)
  assertChoice(field_var, c("field_1", "field_2", "field_3"), add = arg_col)
  assertLogical(x_log, add = arg_col)
  if (arg_col$isEmpty()==F) {map(arg_col$getMessages(),print);reportAssertions(arg_col)}
  
  # browser()
  df <- df %>% 
    filter(field == field_var) %>% 
    exclude_genotyper_fields() # Replace w/ exclude_genotyper_fields?
  if (x_log ==T){df <- df %>% mutate_at(x_var, log10)}
  df %>%
  filter_at(x_var, function(x) !is.na(x) & !is.infinite(x)) %>% 
  group_by(locus, field, genotyper) %>%
  nest() %>%
  mutate(data = map(data, function(x){
    summary(lm(!!sym(y_var) ~ !!sym(x_var), data = x)) %>% broom::tidy() %>% filter(term == x_var)
  })) %>% unnest_wider(data) %>%
  mutate(est_min = round(estimate - 2*std.error, 2),
         est_max = round(estimate + 2*std.error, 2))
}

model_stats <- get_lm_stats(subsample_cells_combined_df, x_var = "reads", y_var = "accuracy", x_log = T)
model_stats
```

```{r}
model_stats <- bind_rows(
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy", x_log = T) %>% mutate(x_var = "reads", y_var = "accuracy"),
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "success", x_log = T) %>% mutate(x_var = "reads", y_var = "success"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy", x_log = T) %>% mutate(x_var = "cells", y_var = "accuracy"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "success", x_log = T) %>% mutate(x_var = "cells", y_var = "success")
)  %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") 
```



# Table
```{r}
col_vars <- c("x_var", "locus")
row_vars <- c("y_var", "genotyper")

flex_df <- model_stats %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") %>% 
  mutate(CI = sprintf("(%s) - (%s)", est_min, est_max)) %>% 
  mutate(genotyper = reformat_hla_genotyper(genotyper)) %>% 
  select(genotyper, locus, CI, x_var, y_var) %>% 
  pivot_wider(names_from = col_vars, values_from = "CI", names_sep =  "-") 

df_key <- tibble(col_keys = names(flex_df)) %>% 
  filter(!(col_keys %in% c(col_vars, row_vars))) %>% 
  separate(col_keys, into = col_vars, sep = "-", remove = F) %>%
  mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>% 
  arrange_at(col_vars) %>% 
  mutate_all(as.character) %>% 
  mutate_at(col_vars[!(col_vars %in% "locus")], str_to_sentence)

output_table <- flex_df %>% 
  select(row_vars, everything()) %>% 
  mutate_at(row_vars[!(row_vars %in% "genotyper")], str_to_sentence) %>% 
  # select(locus, df_key$col_keys) %>% 
  flextable() %>% 
  colformat_char(na_str = "---") %>%
  merge_v(j=1:2) %>% 
  set_header_df(mapping = df_key, key = "col_keys") %>% 
  merge_h(part = "header") %>%
  theme_vanilla() %>% 
  # vline(j=vlines_sequence, border = fp_border_default()) %>%
  fix_border_issues() %>% 
  autofit() %>% 
  fontsize(size = 8, part = "all") 

print(output_table)
# output_table %>% print(preview = "pptx")
```


<!-- ```{r} -->
<!--   df <- df %>%  -->
<!--     mutate(field = reformat_hla_field(field)) %>%  -->
<!--     mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>%  -->
<!--     mutate(cell_value = ifelse(!is.na(se),  -->
<!--                                sprintf("%s %s %s", round(mean_accuracy,2),"\u00B1",round(se,2)), -->
<!--                                sprintf("%s", round(mean_accuracy,2)))) %>% -->
<!--     mutate(cell_value = ifelse(grepl("NA", cell_value), NA, cell_value)) %>%  -->
<!--     select(-sd,-se, -mean_accuracy) %>%  -->
<!--     pivot_wider(names_from = nesting_vars, values_from = "cell_value", names_sep = "-")  -->

<!--   df_key <- suppressWarnings({tibble(col_keys = names(df)) %>%  -->
<!--       filter(!grepl("locus", col_keys)) %>%  -->
<!--       separate(col_keys, into = nesting_vars, sep = "-", remove = F) %>% -->
<!--       mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>%  -->
<!--       arrange_at(nesting_vars) %>%  -->
<!--       mutate_all(as.character) -->
<!--   }) -->

<!--   vline_var <- tail(nesting_vars,1) -->
<!--   vlines_sequence <- seq(1,nrow(df_key),by = length(unique(df_key[[vline_var]]))) -->
<!--   df <- df %>%  -->
<!--     select(locus, df_key$col_keys) %>%  -->
<!--     flextable() %>%  -->
<!--     colformat_char(na_str = "---") %>%  -->
<!--     set_header_df(mapping = df_key, key = "col_keys") %>%  -->
<!--     merge_h(part = "header") %>%  -->
<!--     theme_vanilla() %>%  -->
<!--     vline(j=vlines_sequence, border = fp_border_default()) %>% -->
<!--     fix_border_issues() -->
<!--   return(df) -->
<!-- ``` -->





# Scratch work

<!-- ```{r, fig.width=12, fig.height=3, warning = F, message = F} -->
<!-- subsample_cells_combined_df %>%  -->
<!--   filter(field == "field_2") %>%  -->
<!--   mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>%  -->
<!--   ggplot(aes(x = n_alleles, y = log10(n_cells)))+ -->
<!--   geom_violin(scale = "width")+ -->
<!--   geom_jitter(size = 0.2, height = 0, width =0.05, alpha = 0.2)+ -->
<!--   # geom_boxplot(width=0.1)+ -->
<!--   # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+ -->
<!--   coord_flip()+ -->
<!--   facet_grid(genotyper ~ locus, scales = "free")+ -->
<!--   theme_bw() -->
<!-- ``` -->
<!-- # Coverage stats -->
<!-- ```{r, fig.width=12, fig.height=3, warning = F, message = F} -->
<!-- subsample_cells_combined_df %>%  -->
<!--   ggplot(aes(x = n_cells, y = coverage, color = genotyper)) +  -->
<!--   geom_point() + -->
<!--   facet_wrap(genotyper ~ locus, scales = "free", nrow = 3) +  -->
<!--   theme_bw() + -->
<!--   theme(strip.text = element_blank(), -->
<!--         axis.text.x = element_text(angle = 45, hjust = 1))+ -->
<!--   scale_color_brewer(palette = "Dark2") -->
<!-- ``` -->

<!-- ```{r, fig.width=16, fig.height=3, warning = F, message = F} -->
<!-- subsample_cells_combined_df %>%  -->
<!--   select(-genotyper) %>% distinct() %>%  -->
<!--   mutate(cov_per_cell = coverage/n_cells) %>%  -->
<!--   ggplot(aes(x = log10(n_cells), y = cov_per_cell)) +  -->
<!--   geom_point(color = "black", alpha = 0.1) + -->
<!--   stat_smooth(method = "lm")+ -->
<!--   # stat_smooth()+ -->
<!--     facet_wrap( ~ locus, scales = "free", nrow = 1) + -->
<!--   theme_bw() + -->
<!--   theme(axis.text.x = element_text(angle = 45, hjust = 1))+ -->
<!--   scale_color_brewer(palette = "Dark2") -->
<!-- ``` -->

<!-- ```{r, fig.width=16, fig.height=6, warning = F, message = F} -->
<!-- subsample_cells_combined_df %>%  -->
<!--   select(-genotyper) %>% distinct() %>%  -->
<!--   filter(!is.na(n_cells)) %>%  -->
<!--   mutate(cov_per_cell = coverage/n_cells) %>%  -->
<!--   mutate(n_cells = cut(n_cells, seq(0,10000,by=2500), right = F)) %>%  -->
<!--   ggplot(aes(x = n_cells, y = cov_per_cell)) +  -->
<!--   stat_summary(fun = function(x) mean(x,na.rm=T), geom = "bar") + -->
<!--   # geom_point(color = "black", alpha = 0.1) + -->
<!--   # stat_smooth(method = "lm")+ -->
<!--   # stat_smooth()+ -->
<!--   facet_wrap( ~ locus, scales = "free", nrow = 1) + -->
<!--   theme_bw() + -->
<!--   theme(axis.text.x = element_text(angle = 45, hjust = 1))+ -->
<!--   scale_color_brewer(palette = "Dark2") -->
<!-- ``` -->

<!-- ```{r} -->
<!-- hist(cut(1:100, seq(1,100,10))) -->
<!-- ``` -->

<!-- ```{r} -->
<!-- subsample_cells_combined_df %>%  -->
<!--   ungroup() %>%  -->
<!--   count(genotyper, locus, field) -->
<!-- ``` -->
<!-- ```{r} -->
<!-- subsample_reads_combined_df %>%  -->
<!--   ungroup() %>%  -->
<!--   count(genotyper, locus, field) -->
<!-- ``` -->
